import html
import re

import scrapy
import w3lib.html


class ChanspiderSpider(scrapy.Spider):
    """Spider that flattens one thread page into a single plain-text item.

    For every response it yields one dict with two keys:

    * ``filename`` -- a ``.txt`` name built from the last URL path segment,
      the first ``span.name`` text and the first ``span.subject`` text.
    * ``posts`` -- the text of every ``blockquote.postMessage`` element,
      separated by blank lines.
    """

    name = 'textchan'
    allowed_domains = ['4chan.org']
    # Empty in this class; nothing here populates it.
    # NOTE(review): presumably the URLs are supplied by whoever launches the
    # spider -- confirm against the caller.
    start_urls = []

    # Matches <br>, <br/>, <br /> in any letter case.
    _BR_TAG = re.compile(r'<br\s*/?>', re.IGNORECASE)
    # Characters that are path separators, reserved on common filesystems,
    # or control characters; each one is mapped to an underscore.
    _UNSAFE_FILENAME_CHARS = str.maketrans(
        dict.fromkeys('/\\:*?"<>|\0\r\n\t', '_')
    )

    def extract_text(self, post):
        """Return the readable text of one post.

        Args:
            post: HTML markup of a post as a string.

        Returns:
            The markup with ``<br>`` turned into newlines, every other tag
            removed and HTML entities decoded.
        """
        # remove_tags() would drop <br> without a trace and glue adjacent
        # lines together, so turn the breaks into real newlines first.
        with_newlines = self._BR_TAG.sub('\n', post)
        return html.unescape(w3lib.html.remove_tags(with_newlines))

    def parse(self, response):
        """Yield one item holding the output filename and the thread text.

        Args:
            response: The downloaded page.

        Yields:
            A dict with the keys ``"filename"`` and ``"posts"``.
        """
        # Ignore any fragment, query string or trailing slash so that only
        # the last real path segment ends up in the filename.
        path = response.url.split('#', 1)[0].split('?', 1)[0]
        thread_number = path.rstrip('/').rsplit('/', 1)[-1]

        # Default to '' so a missing element does not put the literal text
        # "None" into the filename.
        name = response.css('span.name::text').get(default='')
        subject = response.css('span.subject::text').get(default='')

        parts = [part.strip() for part in (thread_number, name, subject)]
        stem = '_'.join(part for part in parts if part)
        filename = f'{stem}.txt'.translate(self._UNSAFE_FILENAME_CHARS)

        posts = response.css('blockquote.postMessage').getall()
        posts_text = '\n\n'.join(self.extract_text(p) for p in posts)

        yield {
            "filename": filename,
            "posts": posts_text,
        }
